In [1]:
import plotly
import plotly.graph_objs as go
import plotly.express as px
from plotly.subplots import make_subplots

import numpy as np
import pandas as pd

0. Load data¶

In [81]:
# Read the Netflix titles dataset from the notebook-relative ./data folder.
df = pd.read_csv('./data/netflix_titles.csv')
# Cell output: (rows, columns) of the frame as loaded.
df.shape
Out[81]:
(8807, 12)
In [82]:
# Keep only fully populated rows: first require cast and director to be present,
# then drop every row that still has a missing value in any column.
df = df.dropna(subset=['cast', 'director'], how='any').dropna()
In [137]:
# Parse the textual "date_added" column once, store it back as datetimes,
# and derive the year / month columns from the parsed values.
added_on = pd.to_datetime(df['date_added'])
df["date_added"] = added_on
df['year_added'] = added_on.dt.year
df['month_added'] = added_on.dt.month
In [83]:
# Preview the first rows of the cleaned frame (rich display as the cell output).
df.head()
Out[83]:
show_id type title director cast country date_added release_year rating duration listed_in description
7 s8 Movie Sankofa Haile Gerima Kofi Ghanaba, Oyafunmike Ogunlano, Alexandra D... United States, Ghana, Burkina Faso, United Kin... September 24, 2021 1993 TV-MA 125 min Dramas, Independent Movies, International Movies On a photo shoot in Ghana, an American model s...
8 s9 TV Show The Great British Baking Show Andy Devonshire Mel Giedroyc, Sue Perkins, Mary Berry, Paul Ho... United Kingdom September 24, 2021 2021 TV-14 9 Seasons British TV Shows, Reality TV A talented batch of amateur bakers face off in...
9 s10 Movie The Starling Theodore Melfi Melissa McCarthy, Chris O'Dowd, Kevin Kline, T... United States September 24, 2021 2021 PG-13 104 min Comedies, Dramas A woman adjusting to life after a loss contend...
12 s13 Movie Je Suis Karl Christian Schwochow Luna Wedler, Jannis Niewöhner, Milan Peschel, ... Germany, Czech Republic September 23, 2021 2021 TV-MA 127 min Dramas, International Movies After most of her family is murdered in a terr...
24 s25 Movie Jeans S. Shankar Prashanth, Aishwarya Rai Bachchan, Sri Lakshmi... India September 21, 2021 1998 TV-14 166 min Comedies, International Movies, Romantic Movies When the father of the man she loves insists t...

1. Line plot¶

In [40]:
# Shared sample points for the line-plot and subplot examples:
# from 0 up to (not including) 5 in steps of 0.1.
x = np.arange(0, 5, 0.1)


def square(x):
    """Return ``x`` raised to the second power (element-wise for arrays)."""
    return pow(x, 2)

def cubic(x):
    """Return the cube of ``x``.

    The previous body computed ``5 * x**0.5`` (a scaled square root), which
    contradicted both the function name and the ``x<sup>3</sup>`` legend label
    used when this function is plotted.

    Parameters
    ----------
    x : number or array-like supporting ``**``
        Value(s) to cube; arrays are processed element-wise.

    Returns
    -------
    Same kind as ``x``
        ``x`` raised to the third power.
    """
    return x**3

Plotly themes:

  • 'ggplot2'
  • 'seaborn'
  • 'simple_white'
  • 'plotly'
  • 'plotly_white'
  • 'plotly_dark'
  • 'presentation'
  • 'xgridoff'
  • 'ygridoff'
  • 'gridon'
In [46]:
# One figure with three traces: lines+markers, default mode, and markers only.
# Trace names use <sup> HTML tags because LaTeX doesn't work in the hover hint.
fig = go.Figure()
fig.add_trace(
    go.Scatter(x=x, y=square(x), name='f(x)=x<sup>2</sup>', mode='lines+markers')
)
fig.add_trace(go.Scatter(x=x, y=x, name='$$g(x)=x$$'))
fig.add_trace(
    go.Scatter(x=x, y=cubic(x), name='f(x)=x<sup>3</sup>', mode='markers')
)
fig.update_layout(
    template='plotly_white',
    title="Plot Title",
    xaxis_title="x Axis Title",
    yaxis_title="y Axis Title",
    # horizontal legend, positioned by its centre at x = 0.1
    legend=dict(orientation="h", x=.1, xanchor="center"),
    # explicit plot margins in px
    margin=dict(l=5, r=5, t=30, b=20),
)
fig.show()

1. Scatter plot¶

In [42]:
# Reproducible demo data: two independent standard-normal samples of 6000 points
# (x is drawn first, then y), labelled by whether x exceeds 1.
np.random.seed(0)
x_var, y_var = (np.random.normal(size=6000) for _ in range(2))

norm_data = pd.DataFrame({'x_var': x_var, 'y_var': y_var})
norm_data['category_var'] = np.where(x_var > 1, "Category A", "Category B")
In [52]:
# Semi-transparent scatter of the synthetic sample, coloured by category.
fig = px.scatter(
    data_frame=norm_data,
    x='x_var',
    y='y_var',
    color='category_var',
    opacity=.2,
)
fig.update_layout(
    template='plotly_white',
    title="Scatter Plot",
    xaxis_title="x Axis Title",
    yaxis_title="y Axis Title",
    width=800,
    height=400,
    margin=dict(l=5, r=5, t=30, b=20),
)
fig.show()

2. Subplots¶

In [59]:
# 2x2 grid in which the left cell spans both rows; the right column keeps two panels.
fig = make_subplots(
    rows=2,
    cols=2,
    specs=[
        [{"rowspan": 2}, {}],
        [None, {}],
    ],
)

# Fix the axis ranges of the right-hand column and colour its zero lines.
fig.update_xaxes(col=2, range=[-0.5, 1.5], zeroline=True, zerolinewidth=2, zerolinecolor='#008000')
fig.update_yaxes(col=2, range=[-0.5, 1.5], zeroline=True, zerolinewidth=2, zerolinecolor='LightPink')

# Bottom-right panel: sine and cosine.
fig.add_trace(go.Scatter(x=x, y=np.sin(x), name='sin(x)'), row=2, col=2)
fig.add_trace(go.Scatter(x=x, y=np.cos(x), name='cos(x)'), row=2, col=2)
# Tall left panel: tangent.
fig.add_trace(go.Scatter(x=x, y=np.tan(x), name='tg(x)'), row=1, col=1)

# Top-right panel: the square function plus the identity drawn as outlined markers.
fig.add_trace(
    go.Scatter(x=x, y=square(x), name='f(x)=x<sup>2</sup>', mode='lines+markers'),
    row=1, col=2,
)
fig.add_trace(
    go.Scatter(
        x=x, y=x, name='g(x)=x', mode='markers',
        marker=dict(
            size=10,
            color='LightSkyBlue',
            line=dict(width=3, color='MediumPurple'),
        ),
    ),
    row=1, col=2,
)
fig.update_layout(
    legend=dict(orientation="h", x=.5, xanchor="center"),
    hovermode="x",
    margin=dict(l=0, r=0, t=0, b=0),
)
fig.update_traces(hovertemplate="Value: %{x}<br>Function: %{y}", hoverinfo="all")
fig.show()

3. Bar plots¶

In [84]:
# Number of titles per rating as a two-column frame: 'rating' (label) and 'count'.
# The axis and the value column are named explicitly instead of renaming the
# default 'index' / 'rating' columns: the default names produced by
# value_counts().reset_index() changed in pandas 2.0, where the old rename
# mapping yields two columns both called 'count'.
df_rating = (
    df['rating']
    .value_counts()
    .rename_axis('rating')
    .reset_index(name='count')
)
df_rating.head()
Out[84]:
rating count
0 TV-MA 1822
1 TV-14 1214
2 R 778
3 PG-13 470
4 TV-PG 431

3.0. Simple bar plot¶

In [85]:
# Horizontal bar chart of title counts per rating, with the count printed on each bar.
fig_bar = px.bar(
    df_rating,
    x='count',
    y='rating',
    text='count',
    title='Distribution of Rating',
    # optional single bar colour: color_discrete_sequence=['#b20710']
)
# The x axis is hidden; the bar text already carries the numbers.
fig_bar.update_xaxes(visible=False)
fig_bar.update_yaxes(
    categoryorder='total ascending',
    ticksuffix='  ',
    showgrid=False,
    showline=False,
)
fig_bar.update_traces(marker=dict(line=dict(width=0)), hovertemplate=None)
fig_bar.update_layout(
    title_font=dict(size=25),
    xaxis_title=' ',
    yaxis_title=" ",
    hovermode="y unified",
    margin=dict(t=40, b=0, l=0, r=0),
    legend=dict(orientation="h", x=0.5, xanchor="center", y=1, yanchor="bottom"),
)

3.1. Stacked Bar Plot¶

In [94]:
# Horizontal histogram of ratings, one coloured segment per title type
# (no barmode is passed here, unlike the grouped variant of this chart).
fig_stack_bar = px.histogram(
    df,
    y='rating',
    color='type',
    color_discrete_sequence=['#b20710', 'orange'],
    title='Which has the highest Rating TV shows or Movies?',
)
fig_stack_bar.update_xaxes(visible=False)
fig_stack_bar.update_yaxes(
    categoryorder='total ascending',
    ticksuffix='  ',
    showgrid=False,
    showline=False,
)
fig_stack_bar.update_traces(marker=dict(line=dict(width=0)), hovertemplate=None)
fig_stack_bar.update_layout(
    title_font=dict(size=25),
    xaxis_title=' ',
    yaxis_title=" ",
    hovermode="y unified",
    margin=dict(t=70, b=0, l=0, r=0),
    legend=dict(orientation="h", x=0.5, xanchor="center", y=1, yanchor="bottom"),
)

3.2. Group Bar Chart¶

In [95]:
# Same rating histogram as the stacked version, but with barmode='group'
# so the two title types sit side by side; the x axis stays visible here.
fig_group_bar = px.histogram(
    df,
    y='rating',
    color='type',
    barmode='group',
    color_discrete_sequence=['#b20710', 'orange'],
    title='Which has the highest Rating TV shows or Movies?',
)
fig_group_bar.update_xaxes(showgrid=False)
fig_group_bar.update_yaxes(
    categoryorder='total ascending',
    ticksuffix='  ',
    showgrid=False,
    showline=False,
)
fig_group_bar.update_traces(marker=dict(line=dict(width=0)), hovertemplate=None)
fig_group_bar.update_layout(
    title_font=dict(size=25),
    xaxis_title=' ',
    yaxis_title=" ",
    hovermode="y unified",
    margin=dict(t=70, b=0, l=0, r=0),
    legend=dict(orientation="h", x=0.5, xanchor="center", y=1, yanchor="bottom"),
)

3.3. Bidirectional Bar Plot¶

In [134]:
# making a copy of df
dff = df.copy()

# making 2 df one for tv show and another for movie with rating 
df_tv_show = dff[dff['type']=='TV Show'][['rating', 'type']].rename(columns={'type':'tv_show'})
df_movie = dff[dff['type']=='Movie'][['rating', 'type']].rename(columns={'type':'movie'})

df_tv_show = pd.DataFrame(df_tv_show.rating.value_counts()).reset_index().rename(columns={'index':'tv_show'})
df_tv_show['rating_final'] = df_tv_show['rating'] 
# making rating column value negative
df_tv_show['rating'] *= -1

df_movie = pd.DataFrame(df_movie.rating.value_counts()).reset_index().rename(columns={'index':'movie'})
df_movie = df_movie.sort_values(by=['rating']) # sort for order on plot
In [135]:
# Two panels sharing the y axis with no gap between them: TV shows on the left
# (values are negative, so bars grow leftwards) and movies on the right.
fig = make_subplots(rows=1, cols=2, specs=[[{}, {}]], shared_yaxes=True, horizontal_spacing=0)
# add_trace(row=, col=) is used instead of the deprecated append_trace, matching
# how traces are added to subplots elsewhere in this notebook.
# bar plot for movies
fig.add_trace(
    go.Bar(x=df_movie.rating, y=df_movie.movie, orientation='h', showlegend=True,
           text=df_movie.rating, name='Movie', marker_color='#b20710'),
    row=1, col=2,
)
# bar plot for tv shows; the bar text uses the positive counts kept in rating_final
fig.add_trace(
    go.Bar(x=df_tv_show.rating, y=df_tv_show.tv_show, orientation='h', showlegend=True,
           text=df_tv_show.rating_final, name='TV Show', marker_color='#221f1f'),
    row=1, col=1,
)
fig.update_xaxes(showgrid=False)
fig.update_yaxes(showgrid=False, categoryorder='total ascending', ticksuffix=' ', showline=False)
fig.update_traces(hovertemplate=None, marker=dict(line=dict(width=0)))
fig.update_layout(title='Which has the highest rating TV shows or Movies?',
                  margin=dict(t=80, b=0, l=0, r=0),
                  hovermode="y unified",
                  xaxis_title=' ', yaxis_title=" ",
                  title_font=dict(size=25),
                  legend=dict(orientation="h", yanchor="bottom", y=1, xanchor="center", x=0.5),
                 )

4. Waterfall Chart¶

In [210]:
# Movies added per year and the year-over-year change.
# Fixes: the original referenced an undefined `d2` (should be `df2`) and an
# undefined `col`; both made the cell fail on a fresh kernel. The count column
# is also named explicitly so the result does not depend on the pandas version.
col = "year_added"
df2 = df[df["type"] == "Movie"]
df3 = (
    df2[col]
    .value_counts()
    .rename_axis(col)
    .reset_index(name="count")
    .sort_values(col)
)
# The first year has no predecessor, so its change is recorded as 0.
df3['diff'] = df3['count'].diff().fillna(0).astype(int)
df3 = df3.reset_index(drop=True)
In [211]:
# Waterfall of the yearly change in movies added; bar text shows the absolute change.
fig2 = go.Figure(
    go.Waterfall(
        name="Movie",
        orientation="v",
        x=df3.year_added.map(str).tolist(),
        y=df3['diff'].tolist(),
        text=df3['diff'].abs().map(str).tolist(),
        textposition="auto",
        increasing={"marker": {"color": "green"}},
        decreasing={"marker": {"color": "red"}},
    )
)
fig2.update_yaxes(visible=False, showgrid=False)
fig2.update_xaxes(showgrid=False)
fig2.update_traces(hovertemplate=None)
fig2.update_layout(
    title='Added movies by year',
    title_font=dict(size=25),
    height=350,
    xaxis_title=' ',
    yaxis_title=" ",
    hovermode="x unified",
    margin=dict(t=60, b=0, l=0, r=0),
)
In [212]:
# Waterfall measure per row: every row is 'relative' except the row labelled 0,
# which is marked 'absolute'.
df3['measure'] = 'relative'
df3.loc[0, 'measure'] = 'absolute'
# (left disabled) mark the last row as a total:
# df3.loc[df3.shape[0] - 1, 'measure'] = 'total'
df3
Out[212]:
year_added count diff measure
0 2008 1 0 absolute
1 2009 2 1 relative
2 2010 1 -1 relative
3 2011 13 12 relative
4 2012 3 -10 relative
5 2013 6 3 relative
6 2014 14 8 relative
7 2015 47 33 relative
8 2016 195 148 relative
9 2017 702 507 relative
10 2018 1085 383 relative
11 2019 1236 151 relative
12 2020 1151 -85 relative
13 2021 729 -422 relative
In [213]:
# Waterfall of yearly changes in movies added, closed by a computed "Total" bar.
# The first row is an 'absolute' anchor, so it has to carry the first year's
# actual count. Its 'diff' is only a 0 placeholder: plotting that 0 made every
# running level and the final total too low by the first year's count.
baseline = int(df3['count'].iloc[0])
steps = [baseline] + df3['diff'].to_list()[1:]
fig2 = go.Figure(go.Waterfall(
    name = "Movie", orientation="v",
    x = df3.year_added.apply(str).to_list() + ['Total'],
    textposition="auto",
    # one label per yearly bar; the trailing 'Total' bar gets no text
    text = [str(abs(step)) for step in steps],
    # the 'total' bar takes no value of its own
    y = steps + [None],
    increasing={"marker":{"color":"green"}},
    decreasing={"marker":{"color":"red"}},
    measure=df3['measure'].to_list() + ['total']
))
fig2.update_xaxes(showgrid=False)
fig2.update_yaxes(showgrid=False, visible=False)
fig2.update_traces(hovertemplate=None)
fig2.update_layout(title='Added movies by year', height=350,
                   margin=dict(t=60, b=0, l=0, r=0),
                   hovermode="x unified",
                   xaxis_title=' ', yaxis_title=" ",
                   title_font=dict(size=25),
                  )

5. Sankey diagram¶

5.0. Simple example¶

In [215]:
# data
label = ["ZERO", "ONE", "TWO", "THREE", "FOUR", "FIVE"]
source = [0, 0, 1, 1, 0] # index from 
target = [2, 3, 4, 5, 4] # index to 
value = [8, 2, 2, 8, 4]
# data to dict, dict to sankey
link = dict(source = source, target = target, value = value)
node = dict(label = label, pad=50, thickness=5)
data = go.Sankey(link = link, node=node)
# plot
fig = go.Figure(data)
fig.show()

5.1. Example with clients history¶

In [316]:
# Build every (year, client) combination by joining two frames on a constant key.
# NOTE: this rebinds `df`, which earlier cells use for the Netflix titles frame.
clients = pd.DataFrame({'clientId': range(1, 20), 'id': 1})
dates = pd.DataFrame({'date': range(2017, 2023), 'id': 1})
df = dates.merge(clients, on='id').drop(columns=['id'])
In [317]:
# Client type codes -> human-readable names (code = position in the list).
types = dict(enumerate(['New', 'Not active', 'Good', 'Bad']))
In [318]:
# Create clients type history for each date
df['type'] = None
df.loc[(df.date == 2017) & (df.clientId.isin([i for i in range(1, 11)])), 'type'] = 0
df.loc[(df.date == 2017) & (df.clientId.isin([i for i in range(11, 20)])), 'type'] = 1
df.loc[(df.date == 2018) & (df.clientId.isin([i for i in range(1, 4)])), 'type'] = 2
df.loc[(df.date == 2018) & (df.clientId.isin([i for i in range(4, 20)])), 'type'] = 1
df.loc[(df.date == 2019) & (df.clientId.isin([i for i in range(1, 6)])), 'type'] = 2
df.loc[(df.date == 2019) & (df.clientId.isin([i for i in range(6, 10)])), 'type'] = 3
df.loc[(df.date == 2019) & (df.clientId.isin([i for i in range(10, 20)])), 'type'] = 1
df.loc[(df.date == 2020) & (df.clientId.isin([i for i in range(1, 20)])), 'type'] = 1
df.loc[(df.date == 2021) & (df.clientId.isin([i for i in range(1, 8)])), 'type'] = 2
df.loc[(df.date == 2021) & (df.clientId.isin([i for i in range(8, 10)])), 'type'] = 3
df.loc[(df.date == 2021) & (df.clientId.isin([i for i in range(10, 20)])), 'type'] = 1
df.loc[(df.date == 2022) & (df.clientId.isin([i for i in range(1, 10)])), 'type'] = 1
df.loc[(df.date == 2022) & (df.clientId.isin([i for i in range(10, 20)])), 'type'] = 2
In [319]:
# add next year type
df['next_date'] = df['date'] + 1
df2 = df.merge(df.loc[:, ['date', 'type', 'clientId']].rename({'date': 'next_date', 'type': 'type_next'}, axis=1), 
               how='left', on=['next_date', 'clientId'])
df2.head()
Out[319]:
date clientId type next_date type_next
0 2017 1 0 2018 2
1 2017 2 0 2018 2
2 2017 3 0 2018 2
3 2017 4 0 2018 1
4 2017 5 0 2018 1
In [320]:
# Create column that has date and type
df2['type_with_date'] = df2['date'].apply(str) + '_' + df2['type'].apply(str)
df2['type_next_with_date'] = df2['next_date'].apply(str) + '_' + df2['type_next'].apply(str)
df2 = df2.loc[~df2['type_next'].isna()].copy()
df2.head()
Out[320]:
date clientId type next_date type_next type_with_date type_next_with_date
0 2017 1 0 2018 2 2017_0 2018_2
1 2017 2 0 2018 2 2017_0 2018_2
2 2017 3 0 2018 2 2017_0 2018_2
3 2017 4 0 2018 1 2017_0 2018_1
4 2017 5 0 2018 1 2017_0 2018_1
In [321]:
# Calculate count of each transition
counts = df2.loc[:, ['date', 'type_with_date', 'type_next_with_date']].value_counts()
df_counts = pd.DataFrame(counts)\
    .reset_index()\
    .rename({0: 'cnt'}, axis=1)\
    .sort_values(by=['date'])
df_counts
Out[321]:
date type_with_date type_next_with_date cnt
4 2017 2017_1 2018_1 9
5 2017 2017_0 2018_1 7
11 2017 2017_0 2018_2 3
0 2018 2018_1 2019_1 10
9 2018 2018_1 2019_3 4
12 2018 2018_2 2019_2 3
13 2018 2018_1 2019_2 2
1 2019 2019_1 2020_1 10
8 2019 2019_2 2020_1 5
10 2019 2019_3 2020_1 4
2 2020 2020_1 2021_1 10
6 2020 2020_1 2021_2 7
14 2020 2020_1 2021_3 2
3 2021 2021_1 2022_2 10
7 2021 2021_2 2022_1 7
15 2021 2021_3 2022_1 2
In [322]:
# Get unique id for every transition
all_types = df_counts['type_with_date'].to_list() + df_counts['type_next_with_date'].to_list()
all_types = set(all_types) # drop duplicates
all_types = sorted(list(all_types)) 
d = {}
for i in range(len(all_types)):
    d[all_types[i]] = i 
d
Out[322]:
{'2017_0': 0,
 '2017_1': 1,
 '2018_1': 2,
 '2018_2': 3,
 '2019_1': 4,
 '2019_2': 5,
 '2019_3': 6,
 '2020_1': 7,
 '2021_1': 8,
 '2021_2': 9,
 '2021_3': 10,
 '2022_1': 11,
 '2022_2': 12}
In [325]:
# Give unique id for each transition (for plotly)
df_counts['type_final'] = df_counts['type_with_date'].apply(lambda x: d[x])
df_counts['type_next_final'] = df_counts['type_next_with_date'].apply(lambda x: d[x])
df_counts.head()
Out[325]:
date type_with_date type_next_with_date cnt type_final type_next_final
4 2017 2017_1 2018_1 9 1 2
5 2017 2017_0 2018_1 7 0 2
11 2017 2017_0 2018_2 3 0 3
0 2018 2018_1 2019_1 10 2 4
9 2018 2018_1 2019_3 4 2 6
In [326]:
# Plain Python lists for the Sankey trace: link endpoints, link weights, node labels.
source, target, value = (
    df_counts[column].tolist()
    for column in ('type_final', 'type_next_final', 'cnt')
)
labels = list(d)
In [327]:
# data to dict, dict to sankey
link = dict(source = source, target = target, value = value)
node = dict(label=labels, pad=15, thickness=5)
data = go.Sankey(link = link, node=node)
# plot
fig = go.Figure(data)
fig.show()

6. Run before exporting to HTML¶

In [328]:
# Switch plotly's offline notebook mode on; per this section's heading it is meant
# to be run before the notebook is exported to HTML.
# NOTE(review): presumably this is what lets the figures render in the exported
# file — confirm against the plotly offline documentation.
plotly.offline.init_notebook_mode()